In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
# Loading training data
training = pd.read_csv("data/kaggle-house-prices/train.csv")
training.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1452 non-null object
MasVnrArea       1452 non-null float64
ExterQual        1460 non-null object
ExterCond        1460 non-null object
Foundation       1460 non-null object
BsmtQual         1423 non-null object
BsmtCond         1423 non-null object
BsmtExposure     1422 non-null object
BsmtFinType1     1423 non-null object
BsmtFinSF1       1460 non-null int64
BsmtFinType2     1422 non-null object
BsmtFinSF2       1460 non-null int64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null int64
Heating          1460 non-null object
HeatingQC        1460 non-null object
CentralAir       1460 non-null object
Electrical       1459 non-null object
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
LowQualFinSF     1460 non-null int64
GrLivArea        1460 non-null int64
BsmtFullBath     1460 non-null int64
BsmtHalfBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
BedroomAbvGr     1460 non-null int64
KitchenAbvGr     1460 non-null int64
KitchenQual      1460 non-null object
TotRmsAbvGrd     1460 non-null int64
Functional       1460 non-null object
Fireplaces       1460 non-null int64
FireplaceQu      770 non-null object
GarageType       1379 non-null object
GarageYrBlt      1379 non-null float64
GarageFinish     1379 non-null object
GarageCars       1460 non-null int64
GarageArea       1460 non-null int64
GarageQual       1379 non-null object
GarageCond       1379 non-null object
PavedDrive       1460 non-null object
WoodDeckSF       1460 non-null int64
OpenPorchSF      1460 non-null int64
EnclosedPorch    1460 non-null int64
3SsnPorch        1460 non-null int64
ScreenPorch      1460 non-null int64
PoolArea         1460 non-null int64
PoolQC           7 non-null object
Fence            281 non-null object
MiscFeature      54 non-null object
MiscVal          1460 non-null int64
MoSold           1460 non-null int64
YrSold           1460 non-null int64
SaleType         1460 non-null object
SaleCondition    1460 non-null object
SalePrice        1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB

In [3]:
training.sample(10)


Out[3]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
1328 1329 50 RM 60.0 10440 Pave Grvl Reg Lvl AllPub Corner Gtl OldTown Norm Norm 1Fam 1.5Fin 6 7 1920 1950 Gable CompShg BrkFace Wd Sdng None 0.0 Gd Gd BrkTil Gd TA No LwQ 493 Unf 0 1017 1510 GasW Ex Y SBrkr 1584 1208 0 2792 0 0 2 0 5 1 TA 8 Mod 2 TA Detchd 1920.0 Unf 2 520 Fa TA Y 0 547 0 0 480 0 NaN MnPrv Shed 1150 6 2008 WD Normal 256000
112 113 60 RL 77.0 9965 Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2007 2007 Gable CompShg VinylSd VinylSd Stone 220.0 Gd TA PConc Ex TA Av GLQ 984 Unf 0 280 1264 GasA Ex Y SBrkr 1282 1414 0 2696 1 0 2 1 4 1 Ex 10 Typ 1 Gd BuiltIn 2007.0 Fin 3 792 TA TA Y 120 184 0 0 168 0 NaN NaN NaN 0 10 2007 New Partial 383970
1057 1058 60 RL NaN 29959 Pave NaN IR2 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 7 6 1994 1994 Gable CompShg HdBoard HdBoard None 0.0 Gd TA PConc Gd TA No GLQ 595 Unf 0 378 973 GasA Ex Y SBrkr 979 871 0 1850 0 0 2 1 3 1 Gd 7 Typ 1 Gd BuiltIn 1994.0 Fin 2 467 TA TA Y 168 98 0 0 0 0 NaN NaN NaN 0 1 2009 WD Normal 248000
825 826 20 RL 114.0 14803 Pave NaN Reg Lvl AllPub Inside Gtl NridgHt PosN PosN 1Fam 1Story 10 5 2007 2008 Hip CompShg CemntBd CmentBd BrkFace 816.0 Ex TA PConc Ex TA Av GLQ 1636 Unf 0 442 2078 GasA Ex Y SBrkr 2084 0 0 2084 1 0 2 0 2 1 Ex 7 Typ 1 Gd Attchd 2007.0 Fin 3 1220 TA TA Y 188 45 0 0 0 0 NaN NaN NaN 0 6 2008 New Partial 385000
1234 1235 70 RH 55.0 8525 Pave NaN Reg Bnk AllPub Inside Gtl SWISU Norm Norm 1Fam 2Story 5 6 1911 1950 Gable CompShg MetalSd MetalSd None 0.0 TA TA PConc TA TA Av Unf 0 Unf 0 940 940 GasA TA N FuseA 1024 940 0 1964 0 0 1 1 4 1 TA 7 Typ 0 NaN NaN NaN NaN 0 0 NaN NaN N 0 192 0 0 0 0 NaN NaN NaN 0 11 2008 WD Abnorml 130000
1037 1038 60 RL NaN 9240 Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 8 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 396.0 Gd TA PConc Gd TA No Unf 0 Unf 0 1055 1055 GasA Ex Y SBrkr 1055 1208 0 2263 0 0 2 1 3 1 Gd 7 Typ 1 TA BuiltIn 2001.0 Fin 2 905 TA TA Y 0 45 0 0 189 0 NaN NaN NaN 0 9 2008 WD Normal 287000
1381 1382 20 RL NaN 12925 Pave NaN IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 7 1970 1970 Gable CompShg BrkFace Plywood None 0.0 TA TA CBlock TA TA Mn BLQ 865 Unf 0 340 1205 GasA Ex Y SBrkr 2117 0 0 2117 0 0 2 1 4 1 TA 7 Typ 2 Gd Attchd 1970.0 Fin 2 550 TA TA Y 0 42 0 0 0 0 NaN NaN NaN 0 5 2008 WD Normal 237500
628 629 60 RL 70.0 11606 Pave NaN IR1 HLS AllPub Inside Sev NAmes Norm Norm 1Fam 2Story 5 5 1969 1969 Gable CompShg Plywood Plywood BrkFace 192.0 TA TA PConc Gd TA Av Rec 650 Unf 0 390 1040 GasA TA Y SBrkr 1040 1040 0 2080 0 1 1 2 5 1 Fa 9 Typ 2 TA Attchd 1969.0 Unf 2 504 TA TA Y 335 0 0 0 0 0 NaN NaN NaN 0 9 2007 WD Family 135000
6 7 20 RL 75.0 10084 Pave NaN Reg Lvl AllPub Inside Gtl Somerst Norm Norm 1Fam 1Story 8 5 2004 2005 Gable CompShg VinylSd VinylSd Stone 186.0 Gd TA PConc Ex TA Av GLQ 1369 Unf 0 317 1686 GasA Ex Y SBrkr 1694 0 0 1694 1 0 2 0 3 1 Gd 7 Typ 1 Gd Attchd 2004.0 RFn 2 636 TA TA Y 255 57 0 0 0 0 NaN NaN NaN 0 8 2007 WD Normal 307000
868 869 60 RL NaN 14762 Pave NaN IR2 Lvl AllPub Corner Gtl Gilbert Feedr Norm 1Fam 2Story 5 6 1948 1950 Gable CompShg Plywood Plywood None 0.0 TA TA Slab NaN NaN NaN NaN 0 NaN 0 0 0 GasA Gd Y SBrkr 1547 720 53 2320 0 0 2 0 2 1 TA 7 Typ 1 TA Attchd 1979.0 Unf 2 672 TA TA P 120 144 0 0 0 0 NaN NaN NaN 0 5 2006 WD Normal 169000

In [4]:
SalesPrice = training.SalePrice

In [5]:
# Loading testing data
testing = pd.read_csv("data/kaggle-house-prices/test.csv")
testing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-null object
Exterior2nd      1458 non-null object
MasVnrType       1443 non-null object
MasVnrArea       1444 non-null float64
ExterQual        1459 non-null object
ExterCond        1459 non-null object
Foundation       1459 non-null object
BsmtQual         1415 non-null object
BsmtCond         1414 non-null object
BsmtExposure     1415 non-null object
BsmtFinType1     1417 non-null object
BsmtFinSF1       1458 non-null float64
BsmtFinType2     1417 non-null object
BsmtFinSF2       1458 non-null float64
BsmtUnfSF        1458 non-null float64
TotalBsmtSF      1458 non-null float64
Heating          1459 non-null object
HeatingQC        1459 non-null object
CentralAir       1459 non-null object
Electrical       1459 non-null object
1stFlrSF         1459 non-null int64
2ndFlrSF         1459 non-null int64
LowQualFinSF     1459 non-null int64
GrLivArea        1459 non-null int64
BsmtFullBath     1457 non-null float64
BsmtHalfBath     1457 non-null float64
FullBath         1459 non-null int64
HalfBath         1459 non-null int64
BedroomAbvGr     1459 non-null int64
KitchenAbvGr     1459 non-null int64
KitchenQual      1458 non-null object
TotRmsAbvGrd     1459 non-null int64
Functional       1457 non-null object
Fireplaces       1459 non-null int64
FireplaceQu      729 non-null object
GarageType       1383 non-null object
GarageYrBlt      1381 non-null float64
GarageFinish     1381 non-null object
GarageCars       1458 non-null float64
GarageArea       1458 non-null float64
GarageQual       1381 non-null object
GarageCond       1381 non-null object
PavedDrive       1459 non-null object
WoodDeckSF       1459 non-null int64
OpenPorchSF      1459 non-null int64
EnclosedPorch    1459 non-null int64
3SsnPorch        1459 non-null int64
ScreenPorch      1459 non-null int64
PoolArea         1459 non-null int64
PoolQC           3 non-null object
Fence            290 non-null object
MiscFeature      51 non-null object
MiscVal          1459 non-null int64
MoSold           1459 non-null int64
YrSold           1459 non-null int64
SaleType         1458 non-null object
SaleCondition    1459 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 912.0+ KB

In [6]:
training.shape, testing.shape


Out[6]:
((1460, 81), (1459, 80))

In [7]:
# Combining training and testing data into a single dataframe
df = pd.concat([training.iloc[:, 0:80], testing], ignore_index = True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 80 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-null object
Exterior2nd      2918 non-null object
MasVnrType       2895 non-null object
MasVnrArea       2896 non-null float64
ExterQual        2919 non-null object
ExterCond        2919 non-null object
Foundation       2919 non-null object
BsmtQual         2838 non-null object
BsmtCond         2837 non-null object
BsmtExposure     2837 non-null object
BsmtFinType1     2840 non-null object
BsmtFinSF1       2918 non-null float64
BsmtFinType2     2839 non-null object
BsmtFinSF2       2918 non-null float64
BsmtUnfSF        2918 non-null float64
TotalBsmtSF      2918 non-null float64
Heating          2919 non-null object
HeatingQC        2919 non-null object
CentralAir       2919 non-null object
Electrical       2918 non-null object
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
LowQualFinSF     2919 non-null int64
GrLivArea        2919 non-null int64
BsmtFullBath     2917 non-null float64
BsmtHalfBath     2917 non-null float64
FullBath         2919 non-null int64
HalfBath         2919 non-null int64
BedroomAbvGr     2919 non-null int64
KitchenAbvGr     2919 non-null int64
KitchenQual      2918 non-null object
TotRmsAbvGrd     2919 non-null int64
Functional       2917 non-null object
Fireplaces       2919 non-null int64
FireplaceQu      1499 non-null object
GarageType       2762 non-null object
GarageYrBlt      2760 non-null float64
GarageFinish     2760 non-null object
GarageCars       2918 non-null float64
GarageArea       2918 non-null float64
GarageQual       2760 non-null object
GarageCond       2760 non-null object
PavedDrive       2919 non-null object
WoodDeckSF       2919 non-null int64
OpenPorchSF      2919 non-null int64
EnclosedPorch    2919 non-null int64
3SsnPorch        2919 non-null int64
ScreenPorch      2919 non-null int64
PoolArea         2919 non-null int64
PoolQC           10 non-null object
Fence            571 non-null object
MiscFeature      105 non-null object
MiscVal          2919 non-null int64
MoSold           2919 non-null int64
YrSold           2919 non-null int64
SaleType         2918 non-null object
SaleCondition    2919 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 1.8+ MB

In [8]:
print("Training: ", training.shape)
print("Testing: ", testing.shape)
print("Combined: ", df.shape)


Training:  (1460, 81)
Testing:  (1459, 80)
Combined:  (2919, 80)

Loaded both datasets and combined them into a single dataset for data preprocessing. When we are ready for modeling prepearion, we will split them into 2 datasets.

Amount of NA values


In [9]:
df.isnull().sum().sum() / (df.shape[0] * df.shape[1])


Out[9]:
0.0598021582733813

Close 6% of data points across entire dataset is NA. Find out which columns have null in them


In [10]:
def find_null_columns(): 
    null_columns = df.isnull().sum().sort_values(ascending = False)
    null_columns = null_columns[null_columns > 0]
    print("Total number of records: ", df.shape[0])
    print("No of columns with null values: ", len(null_columns))
    print("\nMissing values per column:")
    print(null_columns)
    
find_null_columns()


Total number of records:  2919
No of columns with null values:  34

Missing values per column:
PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
FireplaceQu     1420
LotFrontage      486
GarageCond       159
GarageQual       159
GarageYrBlt      159
GarageFinish     159
GarageType       157
BsmtCond          82
BsmtExposure      82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtHalfBath       2
Utilities          2
Functional         2
BsmtFullBath       2
BsmtFinSF1         1
Exterior1st        1
Exterior2nd        1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
SaleType           1
Electrical         1
KitchenQual        1
GarageArea         1
GarageCars         1
dtype: int64

There are 34 columns that contain null values. Let's analyze each of these columns one by one in the order of null occurrences.

PoolQC

How many column names contain "pool" - likely that these are pool data related.


In [11]:
def find_columns(df, search):
    matched = [c for c in df.columns if search.lower() in c.lower()]
    return matched

find_columns(df, "pool")


Out[11]:
['PoolArea', 'PoolQC']

From the field types described above, we can see PoolArea is numeric and PoolQC is categorical. Find out where are any records where PoolArea is mentioned but PoolQC is null.


In [12]:
df.query("PoolArea >= 0 and PoolQC != PoolQC").shape


Out[12]:
(2909, 80)

Out of these, how many have >0 PoolArea?


In [13]:
df.query("PoolArea > 0 and PoolQC != PoolQC")[['PoolArea', 'PoolQC']]


Out[13]:
PoolArea PoolQC
2420 368 NaN
2503 444 NaN
2599 561 NaN

We can impute the values of PoolQC, by looking at PoolArea value. Find out how many diff values are there for PoolQC and their freq and mean PoolArea


In [14]:
df.groupby("PoolQC").PoolArea.agg([np.mean, len])


Out[14]:
mean len
PoolQC
Ex 359.75 4
Fa 583.50 2
Gd 648.50 4

So, we can input Ex, Ex, Fa respectively so that PoolArea values are closer the mean values for the respective PoolQC. Replace the rest of NA values under PoolQC with "None".


In [15]:
df.loc[2420,'PoolQC'] = "Ex"
df.loc[2503,'PoolQC'] = "Ex"
df.loc[2599,'PoolQC'] = "Fa"
idx = df.query("PoolArea == 0 and PoolQC != PoolQC").index
df.loc[idx, 'PoolQC'] = "None"
df.query("PoolArea >= 0 and PoolQC != PoolQC").shape


Out[15]:
(0, 80)

Garage Fields

Find garage fields


In [16]:
garage_cols = find_columns(df, "garage")
df[garage_cols].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 7 columns):
GarageType      2762 non-null object
GarageYrBlt     2760 non-null float64
GarageFinish    2760 non-null object
GarageCars      2918 non-null float64
GarageArea      2918 non-null float64
GarageQual      2760 non-null object
GarageCond      2760 non-null object
dtypes: float64(3), object(4)
memory usage: 159.7+ KB

Lets see what the deal is with GarageYrBlt. It seems reasonable that most houses would build a garage when the house itself was built. We can check this by seeing how many houses were built the same year their garage was built.


In [17]:
(df.GarageYrBlt == df.YearBuilt).value_counts()


Out[17]:
True     2216
False     703
dtype: int64

2216 of the 2919 houses have same year for for GarageYrBlt and YearBuilt. Lets replace any of the NA’s for GarageYrBlt with the year from YearBuilt.


In [18]:
df.query("GarageYrBlt != GarageYrBlt").shape


Out[18]:
(159, 80)

In [19]:
idx = df.query("GarageYrBlt != GarageYrBlt").index
df.loc[idx, "GarageYrBlt"] = df.loc[idx, "YearBuilt"]
(df.GarageYrBlt == df.YearBuilt).value_counts()


Out[19]:
True     2375
False     544
dtype: int64

In [20]:
pd.isnull(df.GarageYrBlt).value_counts()


Out[20]:
False    2919
Name: GarageYrBlt, dtype: int64

So, there is no more NA in the GarageYrBlt field.

Let's look at top 10 values of GarageYrBlt and top 10 YearBuilt values to to find any anamolies


In [21]:
print(df.GarageYrBlt.sort_values(ascending = False)[:10])
print(df.YearBuilt.sort_values(ascending = False)[:10])


2592    2207.0
378     2010.0
1502    2010.0
819     2010.0
1608    2010.0
987     2010.0
914     2009.0
572     2009.0
1661    2009.0
1711    2009.0
Name: GarageYrBlt, dtype: float64
378     2010
1576    2010
1502    2010
507     2009
898     2009
762     2009
87      2009
819     2009
103     2009
1608    2009
Name: YearBuilt, dtype: int64

In [22]:
df[df.GarageYrBlt == 2207][["GarageYrBlt", "YearBuilt"]]


Out[22]:
GarageYrBlt YearBuilt
2592 2207.0 2006

Replace GarageYrBlt with YearBuild for record 2593


In [23]:
df.loc[2592, "GarageYrBlt"] = 2006

Plot histogram of GarageYrBlt


In [24]:
df.GarageYrBlt.plot.hist(bins = 100)
plt.title("Histogram by GarageYrBlt")
plt.xlabel("GarageYrBlt")


Out[24]:
<matplotlib.text.Text at 0x116c18828>

Plot histogram of YearBuilt


In [25]:
df.YearBuilt.plot.hist(bins = 100)
plt.title("Histogram by YearBuilt")
plt.xlabel("YearBuilt")


Out[25]:
<matplotlib.text.Text at 0x11a59d470>

Let's see values at lower end to find any anamolies


In [26]:
print(df.GarageYrBlt.sort_values(ascending = True)[:10])
print(df.YearBuilt.sort_values(ascending = True)[:10])


1349    1872.0
1137    1875.0
2422    1890.0
2096    1890.0
2426    1895.0
2579    1895.0
2217    1895.0
1602    1896.0
1856    1900.0
1530    1900.0
Name: GarageYrBlt, dtype: float64
1349    1872
1137    1875
2451    1879
747     1880
1132    1880
1856    1880
630     1880
304     1880
991     1882
1416    1885
Name: YearBuilt, dtype: int64

At the lower values there is no obvious anamoly.

That leaves 6 garage features in our dataset and 4 of them have at least 157 missing values while GarageArea and GarageCars both only have 1, thus we can assume this particular house does not have a garage at all. For the rest of the houses we can check to see that if the NA’s recorded also have 0 GarageArea and 0 GarageCars. If they do we can fill in their missing values with ‘None’ since having 0 area and 0 cars in their garage will imply that they do not have any at all.


In [27]:
df[df.GarageCond.isnull()][garage_cols].query("GarageArea > 0")


Out[27]:
GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond
2126 Detchd 1910.0 NaN 1.0 360.0 NaN NaN

Only one house who had NA’s in their garage columns had an area graeteer than 0. We can fill this house in manually and set the rest of the houses NA’s to 0.

For the house with GarageArea = 360 and GarageCars = 1, but NA’s in the other columns, we can use the most frequent values for each columns from houses with a similar area and car count.


In [28]:
idx = df.query("GarageArea >= 350 and GarageArea <= 370").index
df2 = df.loc[idx, garage_cols]
df2


Out[28]:
GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond
12 Detchd 1962.0 Unf 1.0 352.0 TA TA
14 Attchd 1960.0 RFn 1.0 352.0 TA TA
59 Detchd 1973.0 Unf 1.0 352.0 TA TA
71 Detchd 1987.0 Unf 2.0 360.0 TA TA
98 Basment 1920.0 Unf 1.0 366.0 Fa TA
194 Detchd 1989.0 Unf 1.0 352.0 TA TA
242 Attchd 1940.0 Unf 1.0 352.0 Fa TA
325 Detchd 1941.0 Unf 2.0 360.0 TA TA
330 Detchd 2002.0 Unf 1.0 352.0 TA TA
383 Detchd 1950.0 Unf 2.0 360.0 Fa Fa
387 Detchd 1977.0 Unf 1.0 352.0 TA TA
410 Attchd 1958.0 Unf 1.0 350.0 TA TA
413 Detchd 1927.0 Unf 2.0 360.0 TA TA
427 Detchd 1964.0 Unf 1.0 352.0 TA TA
438 Detchd 1990.0 Unf 1.0 352.0 Gd TA
442 Detchd 1930.0 Unf 2.0 360.0 TA TA
459 Detchd 1950.0 Unf 1.0 352.0 TA TA
462 Detchd 1965.0 Unf 1.0 360.0 TA TA
508 Detchd 1928.0 Unf 2.0 360.0 TA TA
543 Attchd 1998.0 Fin 1.0 367.0 TA TA
579 Attchd 1954.0 Unf 1.0 352.0 Fa TA
627 Basment 1955.0 RFn 1.0 364.0 TA TA
740 Detchd 1910.0 Unf 2.0 360.0 TA Gd
809 Detchd 1910.0 Unf 2.0 360.0 Fa Po
838 Attchd 1999.0 Unf 1.0 352.0 TA TA
848 Detchd 1908.0 Unf 2.0 360.0 Fa TA
893 Attchd 1954.0 Unf 1.0 354.0 TA TA
996 Attchd 1961.0 Unf 1.0 368.0 TA TA
1053 Basment 1957.0 Unf 1.0 364.0 TA TA
1170 Attchd 1977.0 Fin 1.0 358.0 TA TA
... ... ... ... ... ... ... ...
1589 Detchd 1975.0 Unf 1.0 352.0 TA TA
1617 Detchd 1971.0 Unf 1.0 352.0 TA TA
1762 BuiltIn 1961.0 Fin 2.0 368.0 TA TA
1771 Attchd 1955.0 Unf 1.0 350.0 TA TA
1781 Attchd 1958.0 Unf 1.0 366.0 Fa TA
1791 Attchd 1963.0 Unf 1.0 364.0 TA TA
1817 Detchd 1930.0 Unf 1.0 357.0 TA TA
1917 Attchd 1977.0 Unf 1.0 351.0 TA TA
1962 Detchd 1975.0 Unf 1.0 352.0 TA TA
2083 Attchd 1954.0 Unf 1.0 350.0 TA TA
2115 Detchd 1924.0 Unf 2.0 370.0 TA TA
2119 BuiltIn 1920.0 RFn 1.0 355.0 Fa TA
2126 Detchd 1910.0 NaN 1.0 360.0 NaN NaN
2199 Detchd 2001.0 Unf 1.0 369.0 TA TA
2214 Detchd 1934.0 RFn 1.0 360.0 TA TA
2215 Attchd 1958.0 Fin 1.0 356.0 TA TA
2278 Attchd 1970.0 RFn 1.0 368.0 TA Gd
2285 Detchd 1974.0 Unf 1.0 360.0 TA TA
2315 BuiltIn 2007.0 Unf 3.0 350.0 TA TA
2418 Detchd 2002.0 Unf 1.0 352.0 TA TA
2430 Attchd 1950.0 Unf 1.0 357.0 TA TA
2434 Attchd 1965.0 Unf 1.0 351.0 TA TA
2523 Detchd 1979.0 Unf 2.0 370.0 TA TA
2555 Detchd 1959.0 Unf 1.0 355.0 TA TA
2558 Detchd 1973.0 Unf 1.0 352.0 TA TA
2587 Attchd 1975.0 Unf 1.0 360.0 TA TA
2743 Attchd 1956.0 Fin 1.0 353.0 TA TA
2797 Detchd 1924.0 Unf 2.0 370.0 Fa Fa
2860 Attchd 1969.0 Unf 1.0 360.0 TA TA
2884 Attchd 1930.0 Unf 2.0 365.0 Fa TA

67 rows × 7 columns

Find most common values for each column


In [29]:
def mostFrequent(col):
    counts = col.value_counts()
    counts = counts.sort_values(ascending = False)
    return  counts.index[0]

df2.apply(mostFrequent, axis = 0)


Out[29]:
GarageType      Detchd
GarageYrBlt       1950
GarageFinish       Unf
GarageCars           1
GarageArea         352
GarageQual          TA
GarageCond          TA
dtype: object

In [30]:
df.loc[2126, "GarageQual"] = "TA"
df.loc[2126, "GarageFinish"] = "Unf"
df.loc[2126, "GarageCond"] = "TA"

Now we can fill in any missing numeric values with 0 and categoric with ‘None’ since these houses recorded having 0 area and 0 cars in their garage.


In [31]:
pd.isnull(df[garage_cols]).sum()


Out[31]:
GarageType      157
GarageYrBlt       0
GarageFinish    158
GarageCars        1
GarageArea        1
GarageQual      158
GarageCond      158
dtype: int64

In [32]:
garage_cols


Out[32]:
['GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond']

In [33]:
def replace_with(df, col, fill_with):
    idx = df[df[col].isnull()].index
    df.loc[idx, col] = fill_with
    
for c in ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]:
    replace_with(df, c, "None")

for c in ["GarageCars", "GarageArea"]:
    replace_with(df, c, 0.0)

pd.isnull(df[garage_cols]).sum()


Out[33]:
GarageType      0
GarageYrBlt     0
GarageFinish    0
GarageCars      0
GarageArea      0
GarageQual      0
GarageCond      0
dtype: int64

KitchenQual and Electrical

With only 1 missing value for KitchenQual and Electrical each we can fill in the missing value with the most frequent value from each column.


In [34]:
df[find_columns(df, "kitchen")].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 2 columns):
KitchenAbvGr    2919 non-null int64
KitchenQual     2918 non-null object
dtypes: int64(1), object(1)
memory usage: 45.7+ KB

In [35]:
df.KitchenQual.value_counts()


Out[35]:
TA    1492
Gd    1151
Ex     205
Fa      70
Name: KitchenQual, dtype: int64

In [36]:
replace_with(df, "KitchenQual", "TA")

In [37]:
df[find_columns(df, "electrical")].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 1 columns):
Electrical    2918 non-null object
dtypes: object(1)
memory usage: 22.9+ KB

In [38]:
df.Electrical.value_counts()


Out[38]:
SBrkr    2671
FuseA     188
FuseF      50
FuseP       8
Mix         1
Name: Electrical, dtype: int64

In [39]:
replace_with(df, "Electrical", "SBrkr")
df[find_columns(df, "electrical")].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 1 columns):
Electrical    2919 non-null object
dtypes: object(1)
memory usage: 22.9+ KB

Basement


In [40]:
basement_columns = find_columns(df, "bsmt")
df[basement_columns].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 11 columns):
BsmtQual        2838 non-null object
BsmtCond        2837 non-null object
BsmtExposure    2837 non-null object
BsmtFinType1    2840 non-null object
BsmtFinSF1      2918 non-null float64
BsmtFinType2    2839 non-null object
BsmtFinSF2      2918 non-null float64
BsmtUnfSF       2918 non-null float64
TotalBsmtSF     2918 non-null float64
BsmtFullBath    2917 non-null float64
BsmtHalfBath    2917 non-null float64
dtypes: float64(6), object(5)
memory usage: 250.9+ KB

From the documetation below are the description of these fields.

  • BsmtQual: Height of the basement
  • BsmtCond: General condition of the basement
  • BsmtExposure: Walkout or garden level basement walls
  • BsmtFinType1: Quality of basement finished area
  • BsmtFinSF1: Type 1 finished square feet
  • BsmtFinType2: Quality of second finished area (if present)
  • BsmtFinSF2: Type 2 finished square feet
  • BsmtUnfSF: Unfinished square feet of basement area
  • TotalBsmtSF: Total square feet of basement area
  • BsmtFullBath: Basement full bathrooms
  • BsmtHalfBath: Basement half bathrooms

There are 11 basement features each with at least 1 missing value. We can take a look at the subset of just these columns from our data.


In [41]:
df[df.BsmtExposure.isnull()][basement_columns]


Out[41]:
BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath
17 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
39 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
90 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
102 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
156 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
182 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
259 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
342 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
362 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
371 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
392 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
520 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
532 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
533 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
553 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
646 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
705 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
736 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
749 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
778 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
868 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
894 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
897 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
948 Gd TA NaN Unf 0.0 Unf 0.0 936.0 936.0 0.0 0.0
984 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
1000 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
1011 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
1035 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
1045 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
1048 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ...
2066 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2068 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2120 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2122 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2188 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 NaN NaN
2189 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2190 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2193 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2216 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2224 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2348 Gd TA NaN Unf 0.0 Unf 0.0 725.0 725.0 0.0 0.0
2387 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2435 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2452 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2453 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2490 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2498 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2547 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2552 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2564 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2578 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2599 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2702 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2763 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2766 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2803 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2804 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2824 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2891 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0
2904 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0

82 rows × 11 columns


In [42]:
df.BsmtExposure.value_counts()


Out[42]:
No    1904
Av     418
Gd     276
Mn     239
Name: BsmtExposure, dtype: int64

Almost all of the missing values for each categoric basement feature comes from houses with 0 on each features corresponding to area. We can fill in these values with ‘None’ since these houses certainly don’t have basements. Rows 949, 1488 and 2349 are the only missing values from BsmtExposure, we can fill this with No as that is the most frequent value and these houses most likely don’t have any exposure for their basements. The rest of the basement columns corresponding to area will be filled with 0 since they likely don’t have a basement and the categoric missing values will be filled with NoBsmt.


In [43]:
df.loc[[949, 1488, 2349], 'BsmtExposure'] = 'No'

In [44]:
df[basement_columns].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 11 columns):
BsmtQual        2838 non-null object
BsmtCond        2837 non-null object
BsmtExposure    2837 non-null object
BsmtFinType1    2840 non-null object
BsmtFinSF1      2918 non-null float64
BsmtFinType2    2839 non-null object
BsmtFinSF2      2918 non-null float64
BsmtUnfSF       2918 non-null float64
TotalBsmtSF     2918 non-null float64
BsmtFullBath    2917 non-null float64
BsmtHalfBath    2917 non-null float64
dtypes: float64(6), object(5)
memory usage: 250.9+ KB

In [45]:
for c in ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]:
    replace_with(df, c, "None")

for c in ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
          "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath"]:
    replace_with(df, c, 0.0)
    
df[basement_columns].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 11 columns):
BsmtQual        2919 non-null object
BsmtCond        2919 non-null object
BsmtExposure    2919 non-null object
BsmtFinType1    2919 non-null object
BsmtFinSF1      2919 non-null float64
BsmtFinType2    2919 non-null object
BsmtFinSF2      2919 non-null float64
BsmtUnfSF       2919 non-null float64
TotalBsmtSF     2919 non-null float64
BsmtFullBath    2919 non-null float64
BsmtHalfBath    2919 non-null float64
dtypes: float64(6), object(5)
memory usage: 250.9+ KB

In [46]:
find_null_columns()


Total number of records:  2919
No of columns with null values:  13

Missing values per column:
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
LotFrontage     486
MasVnrType       24
MasVnrArea       23
MSZoning          4
Utilities         2
Functional        2
Exterior2nd       1
Exterior1st       1
SaleType          1
dtype: int64

Exterior


In [47]:
exterior_columns = find_columns(df, "exterior")
exterior_columns


Out[47]:
['Exterior1st', 'Exterior2nd']

In [48]:
df.query("Exterior1st != Exterior1st or Exterior2nd != Exterior2nd")[exterior_columns]


Out[48]:
Exterior1st Exterior2nd
2151 NaN NaN

In [49]:
df.loc[2151, "Exterior1st"] = "Other"
df.loc[2151, "Exterior2nd"] = "Other"

In [50]:
find_null_columns()


Total number of records:  2919
No of columns with null values:  11

Missing values per column:
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
LotFrontage     486
MasVnrType       24
MasVnrArea       23
MSZoning          4
Utilities         2
Functional        2
SaleType          1
dtype: int64

Sale Type

SaleType, Functional and Utilities have less than 3 missing values. For SaleType we can see what the SaleCondition of the house was and use a contingency table to see which SaleType and SaleCondition overlap together the most.


In [51]:
sale_columns = find_columns(df, "sale")
sale_columns


Out[51]:
['SaleType', 'SaleCondition']

In [52]:
df[df.SaleType.isnull()][sale_columns]


Out[52]:
SaleType SaleCondition
2489 NaN Normal

In [53]:
df.groupby(["SaleType", "SaleCondition"]).Id.count().unstack()


Out[53]:
SaleCondition Abnorml AdjLand Alloca Family Normal Partial
SaleType
COD 46.0 NaN NaN 2.0 39.0 NaN
CWD 1.0 NaN NaN 1.0 10.0 NaN
Con NaN NaN NaN NaN 4.0 1.0
ConLD 3.0 NaN NaN 1.0 21.0 1.0
ConLI 2.0 NaN NaN 2.0 5.0 NaN
ConLw NaN NaN NaN 1.0 7.0 NaN
New NaN NaN NaN NaN NaN 239.0
Oth 5.0 NaN NaN 1.0 1.0 NaN
WD 133.0 12.0 24.0 38.0 2314.0 4.0

In [54]:
df.loc[2489, "SaleType"] = "WD"
find_null_columns()


Total number of records:  2919
No of columns with null values:  10

Missing values per column:
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
LotFrontage     486
MasVnrType       24
MasVnrArea       23
MSZoning          4
Utilities         2
Functional        2
dtype: int64

Functional

There is no field that could help us replace the NA value for Functional. So let replace NA with most freq value.


In [55]:
functional_columns = find_columns(df, "func")
functional_columns


Out[55]:
['Functional']

In [56]:
df.Functional.value_counts()


Out[56]:
Typ     2717
Min2      70
Min1      65
Mod       35
Maj1      19
Maj2       9
Sev        2
Name: Functional, dtype: int64

In [57]:
replace_with(df, "Functional", "Typ")

In [58]:
find_null_columns()


Total number of records:  2919
No of columns with null values:  9

Missing values per column:
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
LotFrontage     486
MasVnrType       24
MasVnrArea       23
MSZoning          4
Utilities         2
dtype: int64

Utilities

Utilities only has 1 value for NoSeWa and the rest AllPub. We can drop this feature from our dataset as the house with ‘NoSeWa’ is from our training set and will have won’t help with any predictive modelling


In [59]:
df.Utilities.value_counts()


Out[59]:
AllPub    2916
NoSeWa       1
Name: Utilities, dtype: int64

In [60]:
del df["Utilities"]
find_null_columns()


Total number of records:  2919
No of columns with null values:  8

Missing values per column:
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
LotFrontage     486
MasVnrType       24
MasVnrArea       23
MSZoning          4
dtype: int64

In [ ]:

MSZoning

MSZoning: The general zoning classification MSSubClass: The building class

There are only 4 missing values for MSZoning. We can see what the subclass is for the houses with missing values for Zoning.


In [61]:
zoning_columns = ["MSZoning", "MSSubClass"]
df[df.MSZoning.isnull()][zoning_columns]


Out[61]:
MSZoning MSSubClass
1915 NaN 30
2216 NaN 20
2250 NaN 70
2904 NaN 20

Find the most frequent corresponding subclass


In [62]:
df.groupby(["MSSubClass", "MSZoning"]).Id.count().unstack()


Out[62]:
MSZoning C (all) FV RH RL RM
MSSubClass
20 3.0 34.0 4.0 1016.0 20.0
30 8.0 NaN 2.0 61.0 67.0
40 NaN NaN NaN 4.0 2.0
45 NaN NaN 1.0 6.0 11.0
50 7.0 NaN 2.0 159.0 119.0
60 NaN 43.0 NaN 529.0 3.0
70 4.0 NaN 3.0 57.0 63.0
75 NaN NaN NaN 9.0 14.0
80 NaN NaN NaN 115.0 3.0
85 NaN NaN NaN 47.0 1.0
90 NaN NaN 4.0 92.0 13.0
120 NaN 19.0 6.0 117.0 40.0
150 NaN NaN NaN 1.0 NaN
160 NaN 43.0 NaN 21.0 64.0
180 NaN NaN NaN NaN 17.0
190 3.0 NaN 4.0 31.0 23.0

In [63]:
df.loc[1915, "MSZoning"] = "RM"
df.loc[2216, "MSZoning"] = "RL"
df.loc[2250, "MSZoning"] = "RM"
df.loc[2904, "MSZoning"] = "RL"

In [64]:
find_null_columns()


Total number of records:  2919
No of columns with null values:  7

Missing values per column:
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
LotFrontage     486
MasVnrType       24
MasVnrArea       23
dtype: int64

Masonry

MasVnrType: Masonry veneer type MasVnrArea: Masonry veneer area in square feet

There are 23 missing values for MasVnrArea and 24 for MasVnrType. We can see if both missing values come from the same houses


In [65]:
masonary_columns = find_columns(df, "mas")
masonary_columns


Out[65]:
['MasVnrType', 'MasVnrArea']

In [66]:
df[df.MasVnrType.isnull() | df.MasVnrArea.isnull()][masonary_columns]


Out[66]:
MasVnrType MasVnrArea
234 NaN NaN
529 NaN NaN
650 NaN NaN
936 NaN NaN
973 NaN NaN
977 NaN NaN
1243 NaN NaN
1278 NaN NaN
1691 NaN NaN
1706 NaN NaN
1882 NaN NaN
1992 NaN NaN
2004 NaN NaN
2041 NaN NaN
2311 NaN NaN
2325 NaN NaN
2340 NaN NaN
2349 NaN NaN
2368 NaN NaN
2592 NaN NaN
2610 NaN 198.0
2657 NaN NaN
2686 NaN NaN
2862 NaN NaN

All but one house has missing values for both columns. For houses with NA’s on both columns we can fill 0 for the area and None for the type since they likely do not have a masonry veneer. For the house with a MasVnrArea of 198 but NA for MasVnrType we can record the median areas for each type and see which type is closest to 198.


In [67]:
df.groupby("MasVnrType").MasVnrArea.agg([len, np.median])


Out[67]:
len median
MasVnrType
BrkCmn 25.0 161.0
BrkFace 879.0 203.0
None 1742.0 0.0
Stone 249.0 200.0

In [68]:
df.loc[2610, "MasVnrType"] = "Stone"
replace_with(df, "MasVnrType", "None")
replace_with(df, "MasVnrArea", 0)

find_null_columns()


Total number of records:  2919
No of columns with null values:  5

Missing values per column:
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
LotFrontage     486
dtype: int64

LotFrontage: Linear feet of street connected to property

There are 486 missing values for LotFrontage, which is quite a lot of values to fill and we can’t just replace these with 0. We’re given that “LotFrontage: Linear feet of street connected to property.” The area of each street connected to the house property is most likely going to have a similar area to other houses in its neighborhood. We can group by each neighborhood and take the median of each LotFrontage and fill the missing values of each LotFrontage based on what neighborhood the house comes from.


In [69]:
lot_columns = find_columns(df, "lot")
lot_columns.append("Neighborhood")
lot_columns


Out[69]:
['LotFrontage', 'LotArea', 'LotShape', 'LotConfig', 'Neighborhood']

In [70]:
neighborhood = df.groupby("Neighborhood").LotFrontage.agg([len, np.median])
neighborhood


Out[70]:
len median
Neighborhood
Blmngtn 28.0 43.0
Blueste 10.0 24.0
BrDale 30.0 21.0
BrkSide 108.0 51.0
ClearCr 44.0 80.5
CollgCr 267.0 70.0
Crawfor 103.0 70.0
Edwards 194.0 65.0
Gilbert 165.0 64.0
IDOTRR 93.0 60.0
MeadowV 37.0 21.0
Mitchel 114.0 74.0
NAmes 443.0 73.0
NPkVill 23.0 24.0
NWAmes 131.0 80.0
NoRidge 71.0 89.0
NridgHt 166.0 92.0
OldTown 239.0 60.0
SWISU 48.0 60.0
Sawyer 151.0 72.0
SawyerW 125.0 67.0
Somerst 182.0 72.5
StoneBr 51.0 60.0
Timber 72.0 82.0
Veenker 24.0 80.0

In [71]:
df2 = df[lot_columns].merge(neighborhood, 
                      left_on="Neighborhood", right_index=True)
df2


Out[71]:
LotFrontage LotArea LotShape LotConfig Neighborhood len median
0 65.0 8450 Reg Inside CollgCr 267.0 70.0
2 68.0 11250 IR1 Inside CollgCr 267.0 70.0
13 91.0 10652 IR1 Inside CollgCr 267.0 70.0
22 75.0 9742 Reg Inside CollgCr 267.0 70.0
32 85.0 11049 Reg Corner CollgCr 267.0 70.0
36 112.0 10859 Reg Corner CollgCr 267.0 70.0
43 NaN 9200 IR1 CulDSac CollgCr 267.0 70.0
57 89.0 11645 IR1 Corner CollgCr 267.0 70.0
59 60.0 7200 Reg Inside CollgCr 267.0 70.0
64 NaN 9375 Reg Inside CollgCr 267.0 70.0
67 72.0 10665 IR1 Inside CollgCr 267.0 70.0
89 60.0 8070 Reg Inside CollgCr 267.0 70.0
94 69.0 9337 IR1 Inside CollgCr 267.0 70.0
96 78.0 10264 IR1 Inside CollgCr 267.0 70.0
103 94.0 10402 IR1 Corner CollgCr 267.0 70.0
112 77.0 9965 Reg Inside CollgCr 267.0 70.0
119 65.0 8461 Reg Inside CollgCr 267.0 70.0
138 73.0 9066 IR1 Inside CollgCr 267.0 70.0
139 65.0 15426 IR1 Inside CollgCr 267.0 70.0
141 78.0 11645 Reg Inside CollgCr 267.0 70.0
143 78.0 10335 IR1 Inside CollgCr 267.0 70.0
150 120.0 10356 Reg Corner CollgCr 267.0 70.0
192 68.0 9017 IR1 Inside CollgCr 267.0 70.0
194 60.0 7180 IR1 Inside CollgCr 267.0 70.0
203 NaN 4438 Reg Inside CollgCr 267.0 70.0
213 43.0 13568 IR2 CulDSac CollgCr 267.0 70.0
214 NaN 10900 IR1 FR2 CollgCr 267.0 70.0
216 65.0 8450 Reg Inside CollgCr 267.0 70.0
220 73.0 8990 IR1 Inside CollgCr 267.0 70.0
233 75.0 10650 Reg Corner CollgCr 267.0 70.0
... ... ... ... ... ... ... ...
1559 60.0 6048 Reg Corner SWISU 48.0 60.0
1591 67.0 4853 Reg Inside SWISU 48.0 60.0
1592 NaN 7890 Reg Corner SWISU 48.0 60.0
1593 60.0 7200 Reg Inside SWISU 48.0 60.0
1594 51.0 9839 Reg Inside SWISU 48.0 60.0
1896 70.0 6300 Reg Corner SWISU 48.0 60.0
1897 43.0 5707 Reg Inside SWISU 48.0 60.0
2197 60.0 7290 Reg Corner SWISU 48.0 60.0
2198 64.0 7804 Reg Inside SWISU 48.0 60.0
2475 59.0 4484 IR1 Corner SWISU 48.0 60.0
2497 60.0 5400 Reg Inside SWISU 48.0 60.0
2557 60.0 10890 Reg Inside SWISU 48.0 60.0
2805 50.0 5220 IR1 Inside SWISU 48.0 60.0
2806 50.0 5500 Reg Inside SWISU 48.0 60.0
2873 60.0 10890 Reg Inside SWISU 48.0 60.0
2874 58.0 6430 Reg Corner SWISU 48.0 60.0
2875 43.0 7000 Reg Inside SWISU 48.0 60.0
2876 69.0 4899 Reg Corner SWISU 48.0 60.0
2878 84.0 10164 Reg Inside SWISU 48.0 60.0
2879 51.0 6191 Reg Corner SWISU 48.0 60.0
599 24.0 1950 Reg Inside Blueste 10.0 24.0
956 24.0 1300 Reg Inside Blueste 10.0 24.0
1598 35.0 3907 IR1 Inside Blueste 10.0 24.0
1599 35.0 3907 IR1 Inside Blueste 10.0 24.0
1907 24.0 1733 Reg Inside Blueste 10.0 24.0
1908 24.0 1488 Reg Inside Blueste 10.0 24.0
1909 24.0 1612 Reg Inside Blueste 10.0 24.0
2209 24.0 1879 Reg CulDSac Blueste 10.0 24.0
2569 24.0 1782 Reg Inside Blueste 10.0 24.0
2571 35.0 3907 IR1 Inside Blueste 10.0 24.0

2919 rows × 7 columns


In [72]:
idx = df[df.LotFrontage.isnull()].index
df.loc[idx, "LotFrontage"] = df2.loc[idx, "median"]
df[lot_columns].merge(neighborhood, 
                      left_on="Neighborhood", right_index=True)


Out[72]:
LotFrontage LotArea LotShape LotConfig Neighborhood len median
0 65.0 8450 Reg Inside CollgCr 267.0 70.0
2 68.0 11250 IR1 Inside CollgCr 267.0 70.0
13 91.0 10652 IR1 Inside CollgCr 267.0 70.0
22 75.0 9742 Reg Inside CollgCr 267.0 70.0
32 85.0 11049 Reg Corner CollgCr 267.0 70.0
36 112.0 10859 Reg Corner CollgCr 267.0 70.0
43 70.0 9200 IR1 CulDSac CollgCr 267.0 70.0
57 89.0 11645 IR1 Corner CollgCr 267.0 70.0
59 60.0 7200 Reg Inside CollgCr 267.0 70.0
64 70.0 9375 Reg Inside CollgCr 267.0 70.0
67 72.0 10665 IR1 Inside CollgCr 267.0 70.0
89 60.0 8070 Reg Inside CollgCr 267.0 70.0
94 69.0 9337 IR1 Inside CollgCr 267.0 70.0
96 78.0 10264 IR1 Inside CollgCr 267.0 70.0
103 94.0 10402 IR1 Corner CollgCr 267.0 70.0
112 77.0 9965 Reg Inside CollgCr 267.0 70.0
119 65.0 8461 Reg Inside CollgCr 267.0 70.0
138 73.0 9066 IR1 Inside CollgCr 267.0 70.0
139 65.0 15426 IR1 Inside CollgCr 267.0 70.0
141 78.0 11645 Reg Inside CollgCr 267.0 70.0
143 78.0 10335 IR1 Inside CollgCr 267.0 70.0
150 120.0 10356 Reg Corner CollgCr 267.0 70.0
192 68.0 9017 IR1 Inside CollgCr 267.0 70.0
194 60.0 7180 IR1 Inside CollgCr 267.0 70.0
203 70.0 4438 Reg Inside CollgCr 267.0 70.0
213 43.0 13568 IR2 CulDSac CollgCr 267.0 70.0
214 70.0 10900 IR1 FR2 CollgCr 267.0 70.0
216 65.0 8450 Reg Inside CollgCr 267.0 70.0
220 73.0 8990 IR1 Inside CollgCr 267.0 70.0
233 75.0 10650 Reg Corner CollgCr 267.0 70.0
... ... ... ... ... ... ... ...
1559 60.0 6048 Reg Corner SWISU 48.0 60.0
1591 67.0 4853 Reg Inside SWISU 48.0 60.0
1592 60.0 7890 Reg Corner SWISU 48.0 60.0
1593 60.0 7200 Reg Inside SWISU 48.0 60.0
1594 51.0 9839 Reg Inside SWISU 48.0 60.0
1896 70.0 6300 Reg Corner SWISU 48.0 60.0
1897 43.0 5707 Reg Inside SWISU 48.0 60.0
2197 60.0 7290 Reg Corner SWISU 48.0 60.0
2198 64.0 7804 Reg Inside SWISU 48.0 60.0
2475 59.0 4484 IR1 Corner SWISU 48.0 60.0
2497 60.0 5400 Reg Inside SWISU 48.0 60.0
2557 60.0 10890 Reg Inside SWISU 48.0 60.0
2805 50.0 5220 IR1 Inside SWISU 48.0 60.0
2806 50.0 5500 Reg Inside SWISU 48.0 60.0
2873 60.0 10890 Reg Inside SWISU 48.0 60.0
2874 58.0 6430 Reg Corner SWISU 48.0 60.0
2875 43.0 7000 Reg Inside SWISU 48.0 60.0
2876 69.0 4899 Reg Corner SWISU 48.0 60.0
2878 84.0 10164 Reg Inside SWISU 48.0 60.0
2879 51.0 6191 Reg Corner SWISU 48.0 60.0
599 24.0 1950 Reg Inside Blueste 10.0 24.0
956 24.0 1300 Reg Inside Blueste 10.0 24.0
1598 35.0 3907 IR1 Inside Blueste 10.0 24.0
1599 35.0 3907 IR1 Inside Blueste 10.0 24.0
1907 24.0 1733 Reg Inside Blueste 10.0 24.0
1908 24.0 1488 Reg Inside Blueste 10.0 24.0
1909 24.0 1612 Reg Inside Blueste 10.0 24.0
2209 24.0 1879 Reg CulDSac Blueste 10.0 24.0
2569 24.0 1782 Reg Inside Blueste 10.0 24.0
2571 35.0 3907 IR1 Inside Blueste 10.0 24.0

2919 rows × 7 columns


In [73]:
find_null_columns()


Total number of records:  2919
No of columns with null values:  4

Missing values per column:
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
dtype: int64

Fence: Fence quality

We can replace any missing vlues for Fence and MiscFeature with ‘None’ as they probably don’t have this feature with their property.


In [74]:
df.Fence.value_counts()


Out[74]:
MnPrv    329
GdPrv    118
GdWo     112
MnWw      12
Name: Fence, dtype: int64

In [75]:
replace_with(df, "Fence", "None")

In [76]:
find_null_columns()


Total number of records:  2919
No of columns with null values:  3

Missing values per column:
MiscFeature    2814
Alley          2721
FireplaceQu    1420
dtype: int64

MiscFeature


In [77]:
replace_with(df, "MiscFeature", "None")
find_null_columns()


Total number of records:  2919
No of columns with null values:  2

Missing values per column:
Alley          2721
FireplaceQu    1420
dtype: int64

Fireplace

  • Fireplaces: Number of fireplaces
  • FireplaceQu: Fireplace quality. FireplaceQu denotes the fireplace quality.

We can check to see if any of the missing values for FireplaceQu come from houses that recorded having at least 1 fireplace.


In [78]:
fire_columns = find_columns(df, "fire")
fire_columns


Out[78]:
['Fireplaces', 'FireplaceQu']

In [79]:
df.groupby(["FireplaceQu", "Fireplaces"]).Id.count().unstack()


Out[79]:
Fireplaces 1 2 3 4
FireplaceQu
Ex 37.0 5.0 1.0 NaN
Fa 63.0 10.0 1.0 NaN
Gd 627.0 112.0 5.0 NaN
Po 46.0 NaN NaN NaN
TA 495.0 92.0 4.0 1.0

In [80]:
df[df.FireplaceQu.isnull()][fire_columns]


Out[80]:
Fireplaces FireplaceQu
0 0 NaN
5 0 NaN
10 0 NaN
12 0 NaN
15 0 NaN
17 0 NaN
18 0 NaN
19 0 NaN
26 0 NaN
29 0 NaN
30 0 NaN
31 0 NaN
32 0 NaN
36 0 NaN
38 0 NaN
39 0 NaN
42 0 NaN
43 0 NaN
44 0 NaN
47 0 NaN
48 0 NaN
49 0 NaN
50 0 NaN
52 0 NaN
56 0 NaN
57 0 NaN
59 0 NaN
60 0 NaN
61 0 NaN
63 0 NaN
... ... ...
2868 0 NaN
2871 0 NaN
2872 0 NaN
2873 0 NaN
2874 0 NaN
2876 0 NaN
2879 0 NaN
2886 0 NaN
2887 0 NaN
2888 0 NaN
2889 0 NaN
2890 0 NaN
2891 0 NaN
2892 0 NaN
2893 0 NaN
2897 0 NaN
2898 0 NaN
2899 0 NaN
2901 0 NaN
2904 0 NaN
2905 0 NaN
2907 0 NaN
2908 0 NaN
2909 0 NaN
2910 0 NaN
2912 0 NaN
2913 0 NaN
2914 0 NaN
2915 0 NaN
2917 0 NaN

1420 rows × 2 columns

All the houses that have missing values did not record having any fireplaces. We can replace the NA’s with ‘None’ since these houses don’t have any fireplaces at all.


In [81]:
replace_with(df, "FireplaceQu", "None")

In [82]:
find_null_columns()


Total number of records:  2919
No of columns with null values:  1

Missing values per column:
Alley    2721
dtype: int64

Alley: Type of alley access

There are 2721 missing values for Alley and only 2 potential options - Grvl and Pave. We can fill ‘None’ for any of the houses with NA’s as these houses must not have any type of alley access.


In [83]:
replace_with(df, "Alley", "None")
find_null_columns()


Total number of records:  2919
No of columns with null values:  0

Missing values per column:
Series([], dtype: int64)

Congratulations! There is no more missing values

Let's add back the SalePrice column to the dataset. Note for the testing data, there is not SalePrice. So for those, the field will be null.


In [84]:
df.head()


Out[84]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1 60 RL 65.0 8450 Pave None Reg Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706.0 Unf 0.0 150.0 856.0 GasA Ex Y SBrkr 856 854 0 1710 1.0 0.0 2 1 3 1 Gd 8 Typ 0 None Attchd 2003.0 RFn 2.0 548.0 TA TA Y 0 61 0 0 0 0 None None None 0 2 2008 WD Normal
1 2 20 RL 80.0 9600 Pave None Reg Lvl FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978.0 Unf 0.0 284.0 1262.0 GasA Ex Y SBrkr 1262 0 0 1262 0.0 1.0 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.0 RFn 2.0 460.0 TA TA Y 298 0 0 0 0 0 None None None 0 5 2007 WD Normal
2 3 60 RL 68.0 11250 Pave None IR1 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486.0 Unf 0.0 434.0 920.0 GasA Ex Y SBrkr 920 866 0 1786 1.0 0.0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.0 RFn 2.0 608.0 TA TA Y 0 42 0 0 0 0 None None None 0 9 2008 WD Normal
3 4 70 RL 60.0 9550 Pave None IR1 Lvl Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216.0 Unf 0.0 540.0 756.0 GasA Gd Y SBrkr 961 756 0 1717 1.0 0.0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.0 Unf 3.0 642.0 TA TA Y 0 35 272 0 0 0 None None None 0 2 2006 WD Abnorml
4 5 60 RL 84.0 14260 Pave None IR1 Lvl FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655.0 Unf 0.0 490.0 1145.0 GasA Ex Y SBrkr 1145 1053 0 2198 1.0 0.0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.0 RFn 3.0 836.0 TA TA Y 192 84 0 0 0 0 None None None 0 12 2008 WD Normal

In [85]:
df["SalesPrice"] = np.nan
df.loc[0:training.shape[0], "SalesPrice"] = SalesPrice
df.head()


Out[85]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalesPrice
0 1 60 RL 65.0 8450 Pave None Reg Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706.0 Unf 0.0 150.0 856.0 GasA Ex Y SBrkr 856 854 0 1710 1.0 0.0 2 1 3 1 Gd 8 Typ 0 None Attchd 2003.0 RFn 2.0 548.0 TA TA Y 0 61 0 0 0 0 None None None 0 2 2008 WD Normal 208500.0
1 2 20 RL 80.0 9600 Pave None Reg Lvl FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978.0 Unf 0.0 284.0 1262.0 GasA Ex Y SBrkr 1262 0 0 1262 0.0 1.0 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.0 RFn 2.0 460.0 TA TA Y 298 0 0 0 0 0 None None None 0 5 2007 WD Normal 181500.0
2 3 60 RL 68.0 11250 Pave None IR1 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486.0 Unf 0.0 434.0 920.0 GasA Ex Y SBrkr 920 866 0 1786 1.0 0.0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.0 RFn 2.0 608.0 TA TA Y 0 42 0 0 0 0 None None None 0 9 2008 WD Normal 223500.0
3 4 70 RL 60.0 9550 Pave None IR1 Lvl Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216.0 Unf 0.0 540.0 756.0 GasA Gd Y SBrkr 961 756 0 1717 1.0 0.0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.0 Unf 3.0 642.0 TA TA Y 0 35 272 0 0 0 None None None 0 2 2006 WD Abnorml 140000.0
4 5 60 RL 84.0 14260 Pave None IR1 Lvl FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655.0 Unf 0.0 490.0 1145.0 GasA Ex Y SBrkr 1145 1053 0 2198 1.0 0.0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.0 RFn 3.0 836.0 TA TA Y 192 84 0 0 0 0 None None None 0 12 2008 WD Normal 250000.0

In [86]:
df.to_csv("data/kaggle-house-prices/data_combined_cleaned.csv", index = False)

Visualization

Plot the following for train.csv

  • Saleprice  - histogram or boxplot
  • YearBuilt - histogram (better) or frequency plot
  • Saleprice vs YearBuilt - horizontal bar chart
  • Saleprice  vs GarageArea  - scatter chart
  • Median Saleprice by SaleCondition and BldgType - heatmap

In [87]:
xlims = training.SalePrice.min(), training.SalePrice.max()

plt.subplot(2, 1, 1)
training.SalePrice.plot.hist(bins = 50, xlim = xlims, sharex = True, title = "Histogram of SalePrice")
plt.subplot(2, 1, 2)
training.SalePrice.plot.box(vert = False, xlim = xlims, sharex = True, title = "Boxplot of SalePrice")
plt.tight_layout()
plt.xlabel("SalePrice")


Out[87]:
<matplotlib.text.Text at 0x116a69ba8>

In [88]:
training.YearBuilt.plot.hist(bins = 50, title = "Frequency by YearBuilt")


Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x116b19748>

In [89]:
grouped = training.groupby("YearBuilt").SalePrice.median().sort_index()
plt.figure(figsize=(15, 5))
plt.bar(grouped.index, grouped)
plt.xlabel("YearBuilt")
plt.ylabel("Median SalePrice")
plt.title("Median SalePrice by YearBuilt")


Out[89]:
<matplotlib.text.Text at 0x11b13a3c8>

In [90]:
training.plot.scatter("GarageArea", "SalePrice")
plt.title("SalePrice by GarageArea")


Out[90]:
<matplotlib.text.Text at 0x11b4616d8>

In [91]:
import seaborn as sns

In [92]:
grouped = training.groupby(["SaleCondition","BldgType"]).SalePrice.median().unstack()
blues = sns.color_palette("Blues", n_colors=10)
sns.heatmap(grouped, cmap = blues)
plt.title("Median Saleprice \n by SaleCondition and BldgType")


Out[92]:
<matplotlib.text.Text at 0x11d36e208>

In [ ]: